{
 "cells": [
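  {
   "cell_type": "markdown",
   "id": "83b5cb73",
   "metadata": {},
   "source": [
    "### Parsing all VEP predicted effects\n",
    "\n",
    "Reads the tab-delimited VEP output in `4_vrnt_enrich/sv_vep/all/`, skips the `#` header lines, and writes the records to a TSV with `Uploaded_variation` and `Gene` renamed to `vrnt_id` and `gene_id`."
   ]
  },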
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a467b2d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b019f457",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working directory; the input and output paths in this notebook are built from it.\n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "# Path of the parsed VEP table (SV_vep_hg38_all_parsed.tsv) under the working directory.\n",
    "path_out = wkdir_path / \"4_vrnt_enrich/sv_vep/all/SV_vep_hg38_all_parsed.tsv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5d056340",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names applied to the tab-separated fields of each VEP record, one name per field.\n",
    "columns = [\n",
    "    \"Uploaded_variation\",\n",
    "    \"Location\",\n",
    "    \"Allele\",\n",
    "    \"Gene\",\n",
    "    \"Feature\",\n",
    "    \"Feature_type\",\n",
    "    \"Consequence\",\n",
    "    \"cDNA_position\",\n",
    "    \"CDS_position\",\n",
    "    \"Protein_position\",\n",
    "    \"Amino_acids\",\n",
    "    \"Codons\",\n",
    "    \"Existing_variation\",\n",
    "    \"Extra\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ceae8d88",
   "metadata": {},
   "outputs": [],
   "source": [
    "vep_consq_all_path = wkdir_path.joinpath(\"4_vrnt_enrich/sv_vep/all/SV_vep_hg38_all_output.vcf.gz\")\n",
    "\n",
    "# The file name ends in .gz, but a plain text-mode open() cannot decode gzip data.\n",
    "# Check the two-byte gzip magic number and pick the matching opener instead of\n",
    "# trusting the extension, so the cell works whether or not the file is compressed.\n",
    "with open(vep_consq_all_path, \"rb\") as raw_in:\n",
    "    is_gzipped = raw_in.read(2) == b\"\\x1f\\x8b\"\n",
    "open_text = gzip.open if is_gzipped else open\n",
    "\n",
    "# Collect one list of fields per record, keyed by a running row number.\n",
    "vep_consq_dict, count = {}, 0\n",
    "with open_text(vep_consq_all_path, \"rt\") as vep_in:\n",
    "    for line in vep_in:\n",
    "        # Skip '#' header/comment lines and blank lines (a blank line would become a one-field row).\n",
    "        if line.startswith(\"#\") or not line.strip():\n",
    "            continue\n",
    "        # Drop the line terminator before splitting; otherwise it stays attached to the\n",
    "        # last field (\"Extra\") and ends up as an embedded newline in the written table.\n",
    "        vep_consq_dict[count] = line.rstrip(\"\\r\\n\").split(\"\\t\")\n",
    "        count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a77c09ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per parsed record; the dict keys (row numbers) become the index.\n",
    "vep_all_consq_df = pd.DataFrame.from_dict(\n",
    "    vep_consq_dict,\n",
    "    orient=\"index\",\n",
    "    columns=columns,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ec9ef3ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# VEP column name -> column name used in the written table.\n",
    "rename_columns_dict = {\n",
    "    \"Uploaded_variation\": \"vrnt_id\",\n",
    "    \"Gene\": \"gene_id\",\n",
    "}\n",
    "vep_all_path_out = wkdir_path / \"4_vrnt_enrich/sv_vep/all/SV_vep_hg38_all_parsed.tsv\"\n",
    "\n",
    "# rename() returns a new frame; write it tab-separated without the index column.\n",
    "vep_all_consq_renamed_df = vep_all_consq_df.rename(columns=rename_columns_dict)\n",
    "vep_all_consq_renamed_df.to_csv(vep_all_path_out, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a89f3df",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
